import pkuseg
import gensim
import re
import openpyxl
'''
excel_file="C://Users//滑稽鸭阁//Documents//Tencent Files//1160636624//FileRecv//ad表格.xlsx"
wb = openpyxl.load_workbook(excel_file)
sh= wb['工作表1']


re_han= re.compile(u"([、,\u4E00-\u9FD5a-zA-Z]+)")
re_han2= re.compile(u"、，×^/")
pattern = re.compile(r'[\、\，\。\；\>\<\=\;\ \(\)\+\-\×\^\/\%]')
new_word="D://语料//THUOCL-master//data//THUOCL_medical.txt"
'''

# Extract lab values from one free-text line: remove stop words, segment the
# text with pkuseg, then store the token that follows each known term.

# Segmenter using the default model, with the term list passed as pkuseg's
# user dictionary.
seg = pkuseg.pkuseg(model_name='default',user_dict = "D://皮炎参数//ige.txt")

# Map each line of the term list to its 0-based line number.  A term that
# appears on several lines keeps the number of its last occurrence.
dict_ige = dict()
with open('D://皮炎参数//ige.txt',encoding='utf-8') as f:
    for position, raw_line in enumerate(f):
        dict_ige[raw_line.strip('\n')] = position
print(dict_ige)

# Result slots, one per extracted value; 0 means "nothing found".
b=[0]*8
strs="免疫全套：免疫球蛋白E 1067.85IU/ml、补体C3 0.7800 g/L、CD4细胞亚群 32.70%、CD8细胞亚群 39.00%、CD4/CD8比值 0.84；"

# Replace every stop word in the text with a single space.
# NOTE(review): each line is used as a regular-expression pattern, not as a
# literal string - confirm the stop-word file contains no unintended regex
# metacharacters.
with open('D://皮炎参数//ige停用词.txt',encoding='utf-8') as f:
    for raw_line in f:
        stopword = raw_line.strip('\n')
        print(stopword)
        if not stopword:
            # An empty pattern matches at every position, so re.sub would
            # insert a space between all characters of the text.
            continue
        strs = re.sub(stopword, ' ', strs)

text=seg.cut(strs)

# Walk over (token, next token) pairs.  Pairing this way means a term that is
# the very last token simply has no value, instead of raising IndexError.
# NOTE(review): the slot is the term's line number minus one, so the term on
# line 0 lands in b[-1] (the last slot), and a term list longer than
# len(b) + 1 lines would index past the end - confirm this offset is intended.
for token, following in zip(text, text[1:]):
    if token in dict_ige:
        b[dict_ige[token]-1] = following
print(b)










'''
text=seg.cut(str)
print(text)
#生化：24列
site=0
dict5 = dict()
for i in range(2,202):
    s= str(sh.cell(i,27).value)
    print(s)
    s=re.sub(pattern, r'', s)
    blocks=re_han.split(s)
    for ch in range(0,len(blocks)):
       if u'\u4e00' <= blocks[ch] <= u'\u9fff':#判断是否为中文
         if blocks[ch] not in dict5.keys():
            print(blocks[ch],blocks[ch+1])
            dict5[blocks[ch]]=site
            site+=1
print(dict5)
'''